#!/bin/ksh
# IBM_PROLOG_BEGIN_TAG 
# This is an automatically generated prolog. 
#  
#  
#  
# Licensed Materials - Property of IBM 
#  
# (C) COPYRIGHT International Business Machines Corp. 2000,2004 
# All Rights Reserved 
#  
# US Government Users Restricted Rights - Use, duplication or 
# disclosure restricted by GSA ADP Schedule Contract with IBM Corp. 
#  
# IBM_PROLOG_END_TAG 

# "@(#)97   1.26   src/rsct/pts/pam/config/linux_gpfs/cthatsctrl.sh, topology.services, rsct_rzauh, rzauh0431a 1/26/04 15:15:38"

#########################################################################
#                                                                       #
# Topology Services Daemon control script for Linux                     #
#                                                                       #
# Syntax:                                                               #
#     cthatsctrl [-n cluster_name] [-p port_number] -a | -d | -s | -k | #
#         -b | -r | -c | -z | -u | -t | -o | -h | -?                    #
#                                                                       #
#     -a add topology services                                          #
#     -b rebuild configuration. e.g. machines.lst file                  #
#     -c clean topology services(delete from all partitions)            #
#     -d delete topology services                                       #
#     -h print help message                                             #
#     -k stop topology services                                         #
#     -n override cluster name for debugging (Undocumented)             #
#     -o turn off tracing for topology services                         #
#     -p override port number                                           #
#     -r refresh topology services configuration                        #
#     -s start topology services                                        #
#     -t turn on tracing for topology services                          #
#     -u unconfigure topology services (from all partitions)            #
#     -z clean up topology services for de-installation                 #
#     -? print help message                                             #
#                                                                       #
#########################################################################
#                                                                       #
# Developer's note: It is obvious the following algorithm is easier and #
# faster than the one we are using in find_children():                  #
#                                                                       #
#   CHILD_PIDS=""                                                       #
#   ps -A -o pid,ppid | while read PID PPID; do                         #
#       if [[ $PPID = $PARENT_PID ]]; then                              #
#           CHILD_PIDS="$CHILD_PIDS $PID"                               #
#           print -u2 "$CHILD_PIDS"                                     #
#       fi                                                              #
#   done                                                                #
#   print -u2 "$CHILD_PIDS"                                             #
#                                                                       #
# However, the change of CHILD_PIDS made inside of the block that       #
# contains read, i.e. while loop, cannot be used outside of the block.  #
# The print statement inside the while loop will correctly print all    #
# the PIDs whose PPID is $PARENT_PID. However, the print statement      #
# outside of the while loop will print nothing.                         #
#                                                                       #
# Test:change the CHILD_PIDS="" statement to CHILD_PIDS="XYZ". The print#
# statement inside the while loop prints something like "XYZ 123 456".  #
# It indicates the value of variable CHILD_PIDS did get into the while  #
# loop. However, the print statement outside of the while loop prints   #
# "XYZ". This indicates the value of CHILD_PIDS is restored when        #
# exiting the while loop. This problem happens only when read is used.  #
# This is a bug of ksh.                                                 #
#                                                                       #
# The following information is obtained from the BUGS section of the    #
# Linux ksh (pdksh) man page:                                           #
#                                                                       #
#   BTW, the most frequently reported bug is                            #
#       echo hi | read a; echo $a   # Does not print hi                 #
#   I'm aware of this and there is no need to report it.                #
#                                                                       #
#########################################################################

#=======================================================================#
#                                                                       #
# Function: get_set_env                                                 #
# Description: Set values for CLUSTER_NAME, HB_RUNDIR, HB_LOGDIR, and   #
#     HB_SOCKET environment variables and create run/, log/, and soc/   #
#     working directories.                                              #
#                                                                       #
#=======================================================================#

get_set_env() {
    # When running as the official subsystem, drop any inherited overrides
    # so that the HATS configuration cannot be redirected by the caller's
    # environment.
    if [[ ${SUBSYS} = ${OFFICIAL_NAME} ]]
    then
        unset HB_RUNDIR HB_LOGDIR HB_SOCKET
    fi

    # Resolve the cluster name through ct_clusterinfo unless it was
    # already supplied. Without a cluster name nothing else can be
    # derived, so give up.
    if [[ -z "${CLUSTER_NAME}" ]]
    then
        CLUSTER_NAME=`LC_ALL=C ${RSCTBIN}/ct_clusterinfo -c 2> /dev/null`
        if [[ -z "${CLUSTER_NAME}" ]]
        then
            #print -u2 "Could not get cluster information from ${RSCTBIN}/ct_clusterinfo"
            print_message EMSG690 ${SCRIPT} ${RSCTBIN}/ct_clusterinfo
            exit 1
        fi
    fi

    HB_DIR=/var/ct/${CLUSTER_NAME}

    # Fill in the log/run defaults only when no value is present, then
    # drop a single trailing "/" if there is one.
    : "${HB_LOGDIR:=${HB_DIR}/log/${SUBSYSNAME}}"
    : "${HB_RUNDIR:=${HB_DIR}/run/${SUBSYSNAME}}"
    HB_LOGDIR=${HB_LOGDIR%/}
    HB_RUNDIR=${HB_RUNDIR%/}

    # The socket directory is the directory part of HB_SERVER_SOCKET when
    # that is set; otherwise it lives under the cluster directory.
    if [ -z "${HB_SERVER_SOCKET}" ]
    then
        HB_SOCDIR=${HB_DIR}/soc/${SUBSYSNAME}
    else
        HB_SOCDIR=${HB_SERVER_SOCKET%/server_socket}
    fi

    # Create whichever of the run/log/soc directories is still missing.
    for DIR_get_set_env in "${HB_RUNDIR}" "${HB_LOGDIR}" "${HB_SOCDIR}"
    do
        [[ -d ${DIR_get_set_env} ]] || mkdir -p ${DIR_get_set_env}
    done
}

#=======================================================================#
#                                                                       #
# Function: wait_for_my_turn                                            #
# Description: Sequentialize running multiple instances of this program #
#     For example, if a user issues add and delete in two windows, we   #
#     want to do either "delete" after "add" is completely finished or  #
#     "add" after "delete" is completely finished (which we will print  #
#     error messages and exit for "delete") but not "delete" in the     #
#     middle of "add" or "add" in the middle of "delete".               #
#                                                                       #
#     Command lockfile can be used to create one or more semaphore      #
#     files. Command lockfile returns failure and removes all the files #
#     it created up till that point if it cannot create all files it    #
#     is asked to create. All files created by command lockfile will    #
#     be read-only, and therefore will have to be removed with rm -f.   #
#                                                                       #
#     The serialization only works when lockfile exists. Without        #
#     lockfile, the script does not do serialization, which is the      #
#     same as the old behavior.                                         #
#                                                                       #
# Input:                                                                #
#     TIMEOUT: optional time out in seconds.                            #
#                                                                       #
# Return code:                                                          #
#     0 : get through serialization or serialization not supported      #
#     1 : can not get through before time out                           #
#                                                                       #
#=======================================================================#

wait_for_my_turn() {
    # Serialize concurrent runs of this script through ${LOCKFILE}.
    #   ${1}: optional number of attempts (one per second); defaults to
    #         $DFLT_TIMEOUT.
    # Returns 0 when the lock was obtained or serialization is not
    # available, 1 when the lock could not be obtained in time.

    # No serialization if lockfile does not exist.
    if [[ ! -x /usr/bin/lockfile ]]
    then
        return 0
    fi

    TIMEOUT_wait_for_my_turn=${1:-$DFLT_TIMEOUT}

    # Test and create the lock file. No retry (-r 0). Lock files
    # older than 60 seconds are considered obsolete (-l 60). Wait for
    # 1 second before creating the lock file if an obsolete
    # lock file is removed forcibly (-s 1).
    # ${LOCKFILE} is quoted so a path with blanks is one lock file.
    while ! lockfile -r 0 -l 60 -s 1 "${LOCKFILE}" > /dev/null 2>&1
    do
        (( TIMEOUT_wait_for_my_turn = TIMEOUT_wait_for_my_turn - 1 ))
        # "<= 0" rather than "== 0": a timeout of 0 (or a negative or
        # empty one) must give up after the first failed attempt instead
        # of counting down past zero and never timing out.
        if (( TIMEOUT_wait_for_my_turn <= 0 ))
        then
            # Cannot create lock file before timeout. Return "not success".
            return 1
        fi
        sleep 1
    done

    # The lock file was created successfully.
    return 0
}

clean_up_ticket() {
    # Release the serialization lock by removing ${LOCKFILE}.
    # Lock files created by lockfile are read-only. They have to be
    # removed with rm -f.
    # The path is quoted (and preceded by "--") so that a lock file name
    # containing blanks, glob characters or a leading "-" removes exactly
    # that one file. Nothing is done when no lock file name is set.
    if [[ -n "${LOCKFILE}" ]]
    then
        rm -f -- "${LOCKFILE}" > /dev/null 2>&1
    fi
}

#=======================================================================#
#                                                                       #
# Function: chk_subsys_state                                            #
# Description: Check if the subsystem has a PID in src status line.     #
#     We had problems parsing lssrc outputs in NLS enabled environment. #
#     The title line is printed in the locale that lssrc is run in.     #
#     However, the rest output lines, i.e. subsystem status lines,      #
#     are printed in the locale that the srcmstr daemon is run in.      #
#     This behavior may change in the future. We would like to derive   #
#     a method that works no matter if this behavior changes or not.    #
#                                                                       #
#     One thing never changed: active subsystems have a PID. A PID      #
#     consists of only decimal digits which reside in the unique code   #
#     range and never conflicts with any human language. The last field #
#     "active" or "inoperative", on the other hand, never consists of   #
#     only decimal digits in any human language. Hence, in most cases,  #
#     we can consider a subsystem with a PID in its src status line     #
#     "active". Actually, a subsystem in "waiting to stop" can have a   #
#     PID, too. For our purpose, it means that the subsystem is not dead#
#     and can be treated as "active".                                   #
#                                                                       #
# Input:                                                                #
#    SUBSYSNAME: subsystem name to be checked                           #
#    Group_name: optional                                               #
#        subsystem group name: both subsystem and group name need match #
#        empty               : the subsystem does not have a group name #
#        "-"                 : don't care. match subsystem name only    #
#                                                                       #
# Output:                                                               #
#    SUBSYS_PID: contains the PID of the subsystem if it is active or   #
#        0 if the subsystem is not active.                              #
#                                                                       #
# Return code:                                                          #
#    0: $1 is in src and has a PID                                      #
#    1: $1 is in src but not active                                     #
#    2: $1 is not in src                                                #
#                                                                       #
#=======================================================================#

chk_subsys_state() {
    #set -x
    # lssrc failing means the subsystem is not defined in src at all.
    if ! STATE=`LC_ALL=C lssrc -s ${1} 2> /dev/null`
    then
        RETURN_CODE=2
        SUBSYS_PID=0
        return $RETURN_CODE
    fi

    # Keep only the status line(s) that mention the subsystem name.
    STATE=`echo "$STATE" | LC_ALL=C grep "${1}" 2> /dev/null`

    # Pick the part of the pattern that sits between the subsystem name
    # and the PID column:
    #   no group given -> nothing, the PID directly follows the name
    #   "-"            -> any run of non-space characters plus spaces
    #   anything else  -> that exact group name plus spaces
    # sed does not handle "+" well, so "cc*" is written instead of "c+".
    case "${2}" in
        "")
            GROUP_RE_chk_subsys_state=""
            ;;
        "-")
            GROUP_RE_chk_subsys_state="[^ ][^ ]*  *"
            ;;
        *)
            GROUP_RE_chk_subsys_state="${2}  *"
            ;;
    esac

    # Reduce a line of the form " name [group] PID ..." to just the PID.
    SUBSYS_PID=`echo "${STATE}" | \
        LC_ALL=C sed "s/\(^ ${1}  *${GROUP_RE_chk_subsys_state}\)\([0-9][0-9]*\)\( .*\)/\2/"`

    # sed's s/// exits 0 whether or not it replaced anything; an unchanged
    # line is the only sign that no PID was found.
    if [[ "X${SUBSYS_PID}" = "X${STATE}" ]]
    then
        RETURN_CODE=1
        SUBSYS_PID=0
    else
        RETURN_CODE=0
    fi
    return $RETURN_CODE
}

#=======================================================================#
#                                                                       #
# Function: wait_process                                                #
# Description: Wait for processes to die.                               #
# Input:                                                                #
#     TIMEOUT_wait_process: time out in seconds                         #
#     PID_wait_process: PIDs to wait for.                               #
#                                                                       #
# Return code:                                                          #
#     0 : success                                                       #
#     1 : the processes did not die before time out                     #
#                                                                       #
#=======================================================================#

wait_process() {
    # First argument is the timeout in seconds; the rest are PIDs.
    TIMEOUT_wait_process=${1}
    shift

    # Turn the PID arguments into a "-p PID -p PID ..." list for ps.
    # An empty argument ends the list.
    PID_wait_process=""
    for NEXT_PID_wait_process in "$@"
    do
        [ -n "${NEXT_PID_wait_process}" ] || break
        PID_wait_process="$PID_wait_process -p ${NEXT_PID_wait_process}"
    done

    RC_wait_process=0
    # With no PIDs there is nothing to wait for.
    if [ -z "${PID_wait_process}" ]
    then
        return $RC_wait_process
    fi

    # Poll once per second for as long as time remains and ps still
    # reports at least one of the PIDs.
    while (( $TIMEOUT_wait_process > 0 )) && ps ${PID_wait_process} > /dev/null 2>&1
    do
        sleep 1
        (( TIMEOUT_wait_process = ${TIMEOUT_wait_process} - 1 ))
    done

    # A final look decides the result: ps succeeding means a process
    # is still there.
    if ps ${PID_wait_process} > /dev/null 2>&1
    then
        RC_wait_process=1
    fi
    return $RC_wait_process
}

#=======================================================================#
#                                                                       #
# Function: find_children                                               #
# Description: Find the child processes of a process.                   #
#                                                                       #
# Input:                                                                #
#     PARENT_PID : The parent process PID.                              #
#                                                                       #
# Output:                                                               #
#     CHILD_PIDS: The child PIDs separated by a space character.        #
#                                                                       #
# Return code: None                                                     #
#                                                                       #
#=======================================================================#

find_children() {
    # Collect the PIDs of all direct children of process ${1}.
    #   Input : ${1} - parent PID (also stored in PARENT_PID)
    #   Output: CHILD_PIDS - child PIDs separated by single spaces, or
    #           empty when there are none or no parent PID was given.
    PARENT_PID=${1}
    # Always start from an empty result so that a call without a parent
    # PID cannot hand back the children found by an earlier call.
    CHILD_PIDS=""
    if [ -n "${PARENT_PID}" ]
    then
        # The pipeline runs inside a command substitution because a
        # variable assigned in a piped "while read" loop may be lost
        # when the loop ends.
        # awk compares the PPID column as a whole field, so the parent
        # PID is never interpreted as part of a regular expression, and
        # it joins the matching PIDs onto one line.
        CHILD_PIDS=`LC_ALL=C ps -A -o pid,ppid | \
            LC_ALL=C awk -v parent="${PARENT_PID}" \
                '$2 == parent { printf "%s%s", sep, $1; sep = " " }'`
    fi
}

#=======================================================================#
#                                                                       #
# Function: kill_process                                                #
# Description: Kill a process and all its descendants.                  #
#     Linux implements threads like processes. lssrc shows the PID      #
#     of the main thread in its subsystem status lines. The pthread     #
#     library forks a thread manager process when pthread_create() is   #
#     called the first time. The thread manager is the child of the     #
#     main thread. When pthread_create() is called afterward, the       #
#     pthread library forks new threads from the thread manager and     #
#     makes all other threads child processes of the thread manager.    #
#     The HATS pluggable NIMs are forked from the main thread. The      #
#     relationship of the HATS threads looks like:                      #
#                                                                       #
#     MAIN thread                                                       #
#         |--- Thread manager                                           #
#         |       |--- HATS thread                                      #
#         |       |--- HATS thread                                      #
#         |                                                             #
#         |--- NIM process MAIN thread                                  #
#         |       |--- NIM thread manager                               #
#         |               |--- NIM thread                               #
#         |               |--- NIM thread                               #
#         |               |--- NIM thread                               #
#         |                                                             #
#         |--- NIM process                                              #
#                                                                       #
#     To kill all HATS threads, we need to find all processes whose     #
#     parent and grand parent is the main thread.                       #
#                                                                       #
#     Other operating systems, e.g. AIX, implement threads inside a     #
#     process. All threads are killed when the associated process is    #
#     killed. The NIM will detect the HATS daemon died and exit.        #
#                                                                       #
#     Note this subroutine is usually called when stopsrc cannot stop a #
#     subsystem in a given time out. In a slow or heavily loaded system,#
#     it is possible that the process is stopped after time out expires #
#     and the threads to be killed disappears before we physically kill #
#     them.                                                             #
#     "kill -KILL" is the strongest command to kill a process in UNIXs. #
#     It is unlikely to fail on a normal system. Should it fail and     #
#     cause the subroutine returns 1, the system must be running into   #
#     a serious problem and may need to be rebooted.                    #
#                                                                       #
# Input:                                                                #
#     ${1}: PID of the subsystem to be killed                           #
#                                                                       #
# Return code:                                                          #
#     0 : success                                                       #
#     1 : the processes did not die before time out                     #
#                                                                       #
#=======================================================================#

kill_process() {
    #set -x
    # Walk the process tree one generation at a time, starting from the
    # given PID, and accumulate every PID met in SUBSYS_PID_ALL.
    SUBSYS_PID_ALL=${1}
    NEXT_GENERATION=${1}
    until [ -z "${NEXT_GENERATION}" ]
    do
        CURR_GENERATION=${NEXT_GENERATION}
        NEXT_GENERATION=""
        for PARENT_kill_process in ${CURR_GENERATION}
        do
            # find_children hands its result back in $CHILD_PIDS.
            find_children ${PARENT_kill_process}
            # Append only non-empty results: the outer loop ends when a
            # generation turns out to be an empty string.
            NEXT_GENERATION="${NEXT_GENERATION}${CHILD_PIDS:+ ${CHILD_PIDS}}"
        done
        SUBSYS_PID_ALL="${SUBSYS_PID_ALL} ${NEXT_GENERATION}"
    done

    # Important debugging statement. Print all processes to be killed.
    #ps -o pid,ppid,ucomm ${SUBSYS_PID_ALL}

    # Nothing collected means nothing to kill; report success.
    if [ -z "${SUBSYS_PID_ALL}" ]
    then
        RC_kill_process=0
        return $RC_kill_process
    fi

    # Send SIGKILL to the whole set, then wait for all of them to go away.
    kill -KILL ${SUBSYS_PID_ALL} 1> /dev/null 2>&1
    wait_process $DFLT_TIMEOUT ${SUBSYS_PID_ALL}
    RC_kill_process=$?
    return $RC_kill_process
}

#=======================================================================#
#                                                                       #
# Function: add_subsys                                                  #
# Description: Add the $SUBSYSNAME subsystem to src. If the subsystem   #
#     exists, remove the old one and make a new one.                    #
#                                                                       #
# Input: None                                                           #
# Return code:                                                          #
#     0: success                                                        #
#     1: not success                                                    #
#                                                                       #
#=======================================================================#

add_subsys() {
    # Refuse to touch the src entry while the subsystem has a PID.
    if chk_subsys_state ${SUBSYSNAME} "-"
    then
        #print -u2 "The ${SUBSYSNAME} subsystem must be stopped before removing
        # or re-making it."
        print_message EMSG636 ${SCRIPT} ${SUBSYSNAME}
        return 1
    fi

    # Read the HATS subsystem configuration once; PORT and
    # RESTART_BY_SRC are both picked out of this output.
    CT_HATS_INFO_DATA=`LC_ALL=C ${RSCTBIN}/ct_hats_info`

    # A port number that is already set is kept as is; otherwise it is
    # taken from the PORT line of the ct_hats_info output.
    if [[ -z "${PORT_NUMBER}" ]]
    then
        PORT_NUMBER=`echo "${CT_HATS_INFO_DATA}" | \
            LC_ALL=C grep -w "PORT" | \
            LC_ALL=C sed "s/\(^[$SPACE_CHARS]*PORT[$SPACE_CHARS][$SPACE_CHARS]*\)\([0-9][0-9]*\).*$/\2/"`
        if [[ -z ${PORT_NUMBER} ]]
        then
            #print -u2 "Port number not found by ct_hats_info command."
            print_message EMSG691 ${SCRIPT} ${RSCTBIN}/ct_hats_info
            return 1
        fi
    fi

    # Choose the src restart flag. "-R" lets src restart the subsystem
    # after it dies and is the default; "-O" (no restart) is used only
    # when ct_hats_info reports a RESTART_BY_SRC value equal to 0. Any
    # other value counts as 1.
    RESTART_BY_SRC=`echo "${CT_HATS_INFO_DATA}" | \
        LC_ALL=C grep -w "RESTART_BY_SRC" | \
        LC_ALL=C sed "s/\(^[$SPACE_CHARS]*RESTART_BY_SRC[$SPACE_CHARS][$SPACE_CHARS]*\)\([0-9][0-9]*\).*$/\2/"`
    RESTART_BY_SRC_FLAG="-R"
    if [[ -n ${RESTART_BY_SRC} ]] && (( ${RESTART_BY_SRC} == 0 ))
    then
        RESTART_BY_SRC_FLAG="-O"
    fi

    # Register the udp port for the service.
    updservices -s ${SERVICENAME} -p ${PORT_NUMBER} -t udp
    rc=$?
    if [[ $rc != 0 ]]
    then
        #print -u2 "Cannot set port number into /etc/services for service ${SERVICENAME}"
        print_message EMSG638 ${SCRIPT} ${SERVICENAME}
        return 1
    fi

    # Drop any previous definition so that mkssys starts from scratch.
    rmssys -s ${SUBSYSNAME} >/dev/null 2>&1

    # Create the src entry:
    #   -s/-G   subsystem name $SUBSYSNAME in group $SUBSYS
    #   -p      program $RSCTBIN/$SUBSYS, run with uid 0 (-u 0)
    #   -o/-e   stdout and stderr go to ${HB_LOGDIR}/${SUBSYS}.${CLUSTER_NAME}
    #   -R/-O   restart, or do not restart, after an abnormal stop
    #   -Q      multiple instances may not run at the same time
    #   -K      sockets are the communication method
    #   -d      shown by lssrc -a even when inactive
    #   -w 30   wait 30 seconds between SIGTERM and SIGKILL, and before
    #           a restart
    if ! mkssys -s ${SUBSYSNAME} -p ${RSCTBIN}/${SUBSYS} -u 0 \
        -o ${HB_LOGDIR}/${SUBSYS}.${CLUSTER_NAME} \
        -e ${HB_LOGDIR}/${SUBSYS}.${CLUSTER_NAME} \
        ${RESTART_BY_SRC_FLAG} -Q -K -d -w 30 -G ${SUBSYS}
    then
        #print -u2 "Cannot add ${SUBSYSNAME} subsystem to SRC"
        print_message EMSG641 ${SCRIPT} ${SUBSYSNAME}
        return 1
    fi

    # Last step: confirm that the service registration is visible to
    # programs using getservbyname() and getservbyport(). If this fails
    # and NIS is in use, the system administrator may need to update
    # /etc/services on the NIS master server.
    vfyservices -s ${SERVICENAME} -t udp -p ${PORT_NUMBER} || {
        #print -u2 "Cannot verify registration of service name ${SERVICENAME}, protocol udp, port ${PORT_NUMBER}."
        print_message EMSG644 ${SCRIPT} ${SERVICENAME} ${PORT_NUMBER}
        return 1
    }

    # On Linux, additionally hand the rc script to ctinitmgr. Its result
    # does not affect the return code.
    if [[ "X${OSNAME}" = "XLinux" ]]
    then
        ${RSCTBIN}/ctinitmgr -a ${RSCTBIN}/rc.${SUBSYSNAME} ${SUBSYSNAME}
    fi
    return 0
}

#=======================================================================#
#                                                                       #
# Function: wait_stop_subsys                                            #
# Description: Wait for a subsystem to stop                             #
#                                                                       #
# Input:                                                                #
#     SUBSYS_TO_WAIT: subsystem to wait                                 #
#     TIMEOUT: optional timeout in seconds                              #
#                                                                       #
# Return code:                                                          #
#     0: ${SUBSYS_TO_WAIT} is stopped                                   #
#     1: ${SUBSYS_TO_WAIT} is not fully stopped before the given timeout#
#                                                                       #
#=======================================================================#

wait_stop_subsys() {
    # Wait until subsystem ${1} no longer has a running process.
    #   ${1}: subsystem to wait for
    #   ${2}: optional timeout in seconds; $DFLT_TIMEOUT when omitted
    # Returns 0 when the subsystem is stopped (or was not active to
    # begin with), 1 when it is still running after the timeout.

    # Use the caller's timeout when one is given and fall back to the
    # default only when it is missing. (The test used to be inverted,
    # which ignored a supplied timeout and left TIMEOUT empty when
    # none was supplied.)
    if [[ -n "${2}" ]]
    then
        TIMEOUT=${2}
    else
        TIMEOUT=$DFLT_TIMEOUT
    fi

    # chk_subsys_state returns 0 and sets SUBSYS_PID when the subsystem
    # has a PID in its src status line.
    chk_subsys_state ${1} "-"
    if [[ $? -eq 0 ]]
    then
        RC_wait_stop_subsys=1
        (( PRINT_WAIT = 3 ))    # Print a "waiting" every 3 seconds
        (( i = ${TIMEOUT} ))
        while (( $i > 0 ))
        do
            # Never wait longer than the time that is left.
            if (( $i < $PRINT_WAIT ))
            then
                (( PRINT_WAIT=${i} ))
            fi
            wait_process $PRINT_WAIT ${SUBSYS_PID}
            if [[ $? -eq 0 ]]
            then
                RC_wait_stop_subsys=0
                break;
            fi
            (( i = ${i} - $PRINT_WAIT ))
            #print -u2 "Waiting for ${1} subsystem to stop. ${i} seconds remaining."
            print_message I_CthatsctrlWaitForStop ${1} "${i}"
        done
    else
        # Subsystem which is not active or does not exist is considered stopped.
        RC_wait_stop_subsys=0
    fi
    return ${RC_wait_stop_subsys}
}

#=======================================================================#
#                                                                       #
# Function: stop_subsys                                                 #
# Description: Stop the $SUBSYSNAME subsystem                           #
#                                                                       #
# Input: None                                                           #
# Return code:                                                          #
#     0: success                                                        #
#     1: not success                                                    #
#                                                                       #
#=======================================================================#

stop_subsys() {
    RC_stop_subsys=0

    # Only an active subsystem (one with a PID) needs to be stopped.
    if chk_subsys_state ${SUBSYSNAME} "-"
    then
        stopsrc ${STOPSRC_OPT} ${SUBSYSNAME}
        if ! wait_stop_subsys ${SUBSYSNAME} $DFLT_TIMEOUT
        then
            if [[ "X${OSNAME}" != "XLinux" ]]
            then
                # AIX
                #print -u2 "Warning: ${SUBSYSNAME} is not fully stopped in ${DFLT_TIMEOUT} seconds."
                print_message I_TopsvcsctrlDaemonNotStop ${SUBSYSNAME} ${DFLT_TIMEOUT}
                RC_stop_subsys=1
            elif chk_subsys_state ${SUBSYSNAME} "-"
            then
                # Linux, and the re-check (which also refreshes
                # SUBSYS_PID) shows the subsystem is still there:
                # kill it and its descendants.
                #print -u2 "Sending SIGKILL to kill ${SUBSYSNAME} subsystem."
                print_message I_KillingSubsys ${SUBSYSNAME}
                if ! kill_process ${SUBSYS_PID}
                then
                    #print -u2 "Warning: ${SUBSYSNAME} subsystem is not
                    #    fully stopped in $DFLT_TIMEOUT seconds"
                    print_message I_TopsvcsctrlDaemonNotStop ${SUBSYSNAME} ${DFLT_TIMEOUT}
                    RC_stop_subsys=1
                fi
            fi
            # Otherwise (Linux, re-check found no PID): on a slow or
            # heavily loaded system the subsystem may have stopped only
            # just now, and nothing more needs to be done.
        fi
    fi

    # ctinitmgr -k is called whenever the subsystem is not active, no
    # matter whether this script stopped it or not. Some Linux
    # distributions, e.g. RedHat, keep track of which services are up
    # in order to stop them automatically at machine shutdown time.
    if [[ $RC_stop_subsys -eq 0 && "X${OSNAME}" = "XLinux" ]]
    then
        ${RSCTBIN}/ctinitmgr -k ${SUBSYSNAME}
    fi
    return $RC_stop_subsys
}

#=======================================================================#
#                                                                       #
# Function: del_subsys                                                  #
# Description: delete the $SUBSYSNAME subsystem from src.               #
#                                                                       #
# Input: None                                                           #
# Return code:                                                          #
#     0: success                                                        #
#     1: not success                                                    #
#                                                                       #
#=======================================================================#

del_subsys() {
    # Remove ${SUBSYSNAME} from SRC and its UDP port entry from
    # /etc/services. Returns 1 without touching anything while the
    # subsystem is still active; otherwise returns the status of
    # remservices.
    if chk_subsys_state ${SUBSYSNAME} "-"
    then
        # "The ${SUBSYSNAME} subsystem must be stopped before removing
        # or re-making it."
        print_message EMSG636 ${SCRIPT} ${SUBSYSNAME}
        return 1
    fi

    # Take the subsystem out of SRC. Output and exit status are
    # discarded.
    rmssys -s ${SUBSYSNAME} >/dev/null 2>&1

    # Linux only: also call "ctinitmgr -d" for the subsystem.
    case "X${OSNAME}" in
    XLinux )
        ${RSCTBIN}/ctinitmgr -d ${SUBSYSNAME}
        ;;
    esac

    # Take the port out of /etc/services. This is the last command, so
    # its exit status becomes the function's return code.
    remservices -s ${SERVICENAME} -t udp
}

#=======================================================================#
#                                                                       #
# Function: rebuild_subsys                                              #
# Description: rebuild HATS configuration file machines.lst             #
#                                                                       #
# Input: None                                                           #
# Return code:                                                          #
#     0: success                                                        #
#     1: not success                                                    #
#                                                                       #
#=======================================================================#

rebuild_subsys() {
    # Run "${SUBSYS} -b" (e.g. "cthats -b") to rebuild the machines.lst
    # configuration file. Sets and returns RETURN_CODE: 0 on success,
    # 1 on any failure.
    ${RSCTBIN}/${SUBSYS} -b
    case $? in
    0 )
        RETURN_CODE=0
        ;;
    * )
        # The command is expected to have reported the problem itself,
        # so no additional message is printed here.
        RETURN_CODE=1
        ;;
    esac
    return ${RETURN_CODE}
}

#=======================================================================#
#                                                                       #
# Function: refresh_subsys                                              #
# Description: refresh HATS subsystem                                   #
#                                                                       #
# Input: None                                                           #
# Return code:                                                          #
#     0: success                                                        #
#     1: not success                                                    #
#                                                                       #
#=======================================================================#

refresh_subsys() {
    # Run "${SUBSYS} -r", capturing stdout and stderr in
    # ${HB_LOGDIR}/refreshOutput. Sets and returns RETURN_CODE: 0 on
    # success, 1 on failure (after pointing the user at the output file).
    if ${RSCTBIN}/${SUBSYS} -r > ${HB_LOGDIR}/refreshOutput 2>&1
    then
        RETURN_CODE=0
        return ${RETURN_CODE}
    fi

    # "Error refreshing ${SUBSYS} subsystem. See detail in
    # ${HB_LOGDIR}/refreshOutput"
    print_message EMSG852 ${SCRIPT} ${HB_LOGDIR}/refreshOutput
    RETURN_CODE=1
    return ${RETURN_CODE}
}

#=======================================================================#
#                                                                       #
# Function: clean_subsys                                                #
# Description: stop the $SUBSYSNAME subsystem and remove it from src    #
#                                                                       #
# Input: None                                                           #
# Return code:                                                          #
#     0 : success                                                       #
#     1 : remservices cannot take the subsystem out of /etc/services    #
#                                                                       #
#=======================================================================#

clean_subsys() {
    # Stop ${SUBSYSNAME} and delete it from SRC. Sets and returns
    # RC_clean_subsys: the status of del_subsys when the subsystem is
    # known to SRC, 0 when there is nothing to clean.
    chk_subsys_state ${SUBSYSNAME} "-"
    RC_clean_subsys=$?

    case $RC_clean_subsys in
    0 | 1 )
        # State codes 0 and 1: stop the subsystem first (the stop status
        # is not checked), then delete it.
        stop_subsys
        del_subsys
        RC_clean_subsys=$?
        ;;
    * )
        # ${SUBSYSNAME} is not in SRC. No need to clean anything.
        RC_clean_subsys=0
        ;;
    esac

    return $RC_clean_subsys
}

#=======================================================================#
#                                                                       #
# Function: unconfig_subsys                                             #
# Description: stop the $SUBSYSNAME subsystem and remove it from src    #
#     and the repository                                                #
#                                                                       #
# Input: None                                                           #
# Return code:                                                          #
#     0 : success                                                       #
#     1 : can not unconfig the subsystem                                #
#                                                                       #
#=======================================================================#

unconfig_subsys() {
    # Unconfigure the subsystem. At present this is only clean_subsys
    # with its status collapsed to 0 (success) or 1 (failure).
    clean_subsys || return 1

    # We haven't decided how to deal with the repository side of this
    # in Linux clusters yet, so nothing more is done here.
    return 0
}

#=======================================================================#
#                                                                       #
# Function: deinstall_subsys                                            #
# Description: stop the $SUBSYSNAME subsystem and remove everything     #
#     it created at run time for de-installing it                       #
#                                                                       #
# Input: None                                                           #
# Return code:                                                          #
#     0 : success                                                       #
#     1 : can not take the subsystem out of the system                  #
#                                                                       #
#=======================================================================#

deinstall_subsys() {
    # Clean the subsystem, then remove the directories named by
    # HB_LOGDIR, HB_RUNDIR and HB_SOCDIR. Returns 1, leaving the
    # directories in place, when clean_subsys fails; 0 otherwise.
    clean_subsys || return 1

    # Removal is best effort: errors are discarded and do not affect the
    # return code.
    rm -rf ${HB_LOGDIR} ${HB_RUNDIR} ${HB_SOCDIR} 2> /dev/null

    return 0
}

#########################################################################
#                                                                       #
# NLS related subroutines                                               #
#                                                                       #
#########################################################################

#=======================================================================#
#                                                                       #
# Function: get_locale_env                                              #
# Description: set variable LOCALE_ENV according to the locale          #
#       environment variables.                                          #
#                                                                       #
#=======================================================================#

get_locale_env() {
    # Build LOCALE_ENV as a blank-separated list of NAME=value pairs, one
    # for every locale variable that is set to a non-empty value, in the
    # fixed order LANG, LC_COLLATE, LC_CTYPE, LC_MONETARY, LC_NUMERIC,
    # LC_TIME, LC_MESSAGES, LC_ALL.
    #
    # "${VAR:+ VAR=$VAR}" expands to " VAR=value" when VAR is set and
    # non-empty, and to nothing otherwise.
    LOCALE_ENV="${LANG:+ LANG=$LANG}"
    LOCALE_ENV="${LOCALE_ENV}${LC_COLLATE:+ LC_COLLATE=$LC_COLLATE}"
    LOCALE_ENV="${LOCALE_ENV}${LC_CTYPE:+ LC_CTYPE=$LC_CTYPE}"
    LOCALE_ENV="${LOCALE_ENV}${LC_MONETARY:+ LC_MONETARY=$LC_MONETARY}"
    LOCALE_ENV="${LOCALE_ENV}${LC_NUMERIC:+ LC_NUMERIC=$LC_NUMERIC}"
    LOCALE_ENV="${LOCALE_ENV}${LC_TIME:+ LC_TIME=$LC_TIME}"
    LOCALE_ENV="${LOCALE_ENV}${LC_MESSAGES:+ LC_MESSAGES=$LC_MESSAGES}"
    LOCALE_ENV="${LOCALE_ENV}${LC_ALL:+ LC_ALL=$LC_ALL}"

    # $LOCALE_ENV is intentionally left unquoted here: word splitting
    # makes echo print the items separated by exactly one space, with
    # leading and trailing blanks removed.
    LOCALE_ENV=$(echo $LOCALE_ENV)
}

#=======================================================================#
#                                                                       #
# Function: print_message                                               #
# Description: wrapper for message printing. All messages go to stderr. #
#                                                                       #
#=======================================================================#

print_message() {
    # Format a catalog message with ${MSGCMD} and print it on stderr.
    #
    # Arguments: $1 is the message identifier, the remaining arguments
    # are the message inserts.
    #
    # ${MSGCMD} is left unquoted on purpose: it holds the display command
    # together with its fixed leading arguments and must be split into
    # words. The caller's arguments are forwarded as "$@" so an insert
    # containing blanks or pattern characters (e.g. "*") reaches the
    # command as one unmodified argument instead of being re-split and
    # glob-expanded.
    print -u2 "$(${MSGCMD} "$@")"
}

#########################################################################
#                                                                       #
# cthatsctrl.sh main program starts here.                               #
#                                                                       #
#########################################################################

#set -x
# Platform identification. These run with the caller's PATH; PATH is
# only reset a few lines further down.
OSNAME=$(uname -s 2> /dev/null)
MACHNAME=$(uname -m 2> /dev/null)

CONTROL_SFX="ctrl"              # suffix appended to the subsystem name to
                                # form this script's name
OFFICIAL_NAME="cthats"          # any other subsystem name is a debugging setup
RSCTBIN=/usr/sbin/rsct/bin
PATH=/bin:/sbin:/usr/bin:/usr/sbin:${RSCTBIN}
MSGMAPPATH=/usr/sbin/rsct/msgmaps
export PATH MSGMAPPATH

SPACE_CHARS=" 	"               # one space followed by one tab
DFLT_TIMEOUT=60                 # default time out for wait_stop_subsys()
                                # and wait_for_my_turn()
SCRIPT=$(basename $0)
SUBSYS=${SCRIPT%${CONTROL_SFX}} # subsystem group name: $SCRIPT minus a
                                # trailing "ctrl", or $SCRIPT unchanged
                                # when there is no such suffix

# The official name of this program is cthatsctrl. Stripping the suffix
# only leaves the name unchanged when the script was given a name that
# does not end in "ctrl". This check is for developers' use only, so the
# message does not go through print_message().
[[ "${SUBSYS}" != "${SCRIPT}" ]] || {
    print -u2 "This control program must be called xxxxctrl."
    exit 1
}

SUBSYSNAME=${SUBSYS}            # name of subsystem on a node
# An alternate subsystem name could be handed to mkssys so that older
# programs can still use "lssrc -s hats" to get the status of cthats:
#
#     # lssrc -s hats
#     Subsystem         Group            PID     Status
#      cthats           cthats           16186   active
#
# It is currently disabled:
#ALT_SUBSYSNAME=hats            # alternate subsystem name

SERVICENAME=${SUBSYS}           # service name for the /etc/services entry

LOCKFILE=/tmp/${SCRIPT}.lock    # serialization lock file name

# Signal numbers.
SIGINT=2
SIGTERM=15

# Messages come from hats.cat, except in a debugging environment
# (subsystem name is not "cthats") that ships its own readable message
# map file; then the matching <subsys>.cat catalog is used.
CATFILE=hats.cat
if [[ "${SUBSYS}" != "${OFFICIAL_NAME}" && -r "${MSGMAPPATH}/${SUBSYS}.script.map" ]]
then
    CATFILE=${SUBSYS}.cat
fi
MSGCMD="${RSCTBIN}/ctdspmsg script $CATFILE"

# Capture the current locale environment variables in LOCALE_ENV so they
# can be passed to the startsrc command.
get_locale_env

# Due to a Linux signal handling problem in multi-thread environments,
# the stopsrc "-c" option, which sends signals to the daemon, does not
# work correctly for the HATS daemon. Linux therefore uses plain
# "stopsrc -s cthats".
case "X${OSNAME}" in
XLinux )
    STOPSRC_OPT="-s"
    ;;
* )
    STOPSRC_OPT="-c -s"
    ;;
esac

#
# Parse command line options
#

Tracelist=
Argcount=0                      # number of operation flags seen

# Option summary (-n is an undocumented internal debugging option):
#   -a  add topology services
#   -b  rebuild configuration
#   -c  clean topology services (delete from all partitions)
#   -d  delete topology services
#   -e  start topology services with the Dead Man Switch enabled
#   -k  stop topology services
#   -o  turn off tracing for topology services
#   -p  override port number
#   -r  refresh topology services configuration
#   -s  start topology services
#   -t  turn on tracing for topology services
#   -u  unconfigure topology services (from all partitions)
#   -z  clean up topology services for de-installation
#   -h  print the usage message
#
# The leading ":" in the option string selects getopts' silent error
# mode, so unknown options and missing option arguments are reported by
# the catch-all arm below rather than by getopts itself.
while getopts ":adsekbrcuztop:n:h?" opt
do
    case $opt in
    # Modifiers: remember the value but do not count them as an
    # operation flag.
    p ) PORT_NUMBER=${OPTARG}
        continue ;;
    n ) CLUSTER_NAME=${OPTARG}
        continue ;;

    # Operation flags: each one selects the action to perform.
    a ) Op=add ;;
    d ) Op=delete ;;
    s ) Op=start ;;
    e ) Op=start_with_DMS ;;
    k ) Op=stop ;;
    b ) Op=rebuild ;;
    r ) Op=refresh ;;
    c ) Op=clean ;;
    u ) Op=unconfig ;;
    z ) Op=deinstall ;;
    t ) Op=traceon
        Tracelist="$OPTARG" ;;
    o ) Op=traceoff
        Tracelist="$OPTARG" ;;
    h ) Op=print_usage ;;

    * )
        # Anything else getopts hands back.
        # "Unknown command line option ${OPTARG}" followed by the usage.
        print_message EMSG680 ${SCRIPT} ${OPTARG}
        print_message I_CthatsctrlUsage ${SCRIPT}
        exit 1 ;;
    esac

    # Only operation flags get this far.
    Argcount=$((Argcount + 1))
done

# Check for valid flags and/or arguments

# Exactly one operation flag must have been given.
case ${Argcount} in
0 )
    # "Missing command flag" followed by the usage.
    print_message EMSG681 ${SCRIPT}
    print_message I_CthatsctrlUsage ${SCRIPT}
    exit 1
    ;;
1 )
    ;;
* )
    # "Only one command line option is permitted."
    print_message EMSG682 ${SCRIPT}
    print_message I_CthatsctrlUsage ${SCRIPT}
    exit 1
    ;;
esac

#
# Actions that do not need the serialization lock.
#

case ${Op} in
print_usage )
    print_message I_CthatsctrlUsage ${SCRIPT}
    exit 0
    ;;

add | delete | clean | unconfig | deinstall )
    # Get common shell functions, e.g. updservices. We will use
    # updservices until a more robust method of updating /etc/services
    # is available.
    # A missing or unreadable file is reported, but the script carries
    # on regardless.
    if [[ -r ${RSCTBIN}/ct_etc_functions ]]
    then
        . ${RSCTBIN}/ct_etc_functions
    elif [[ -e ${RSCTBIN}/ct_etc_functions ]]
    then
        # "${RSCTBIN}/ct_etc_functions is not readable"
        print_message EMSG867 ${SCRIPT} ${RSCTBIN}/ct_etc_functions
    else
        # "${RSCTBIN}/ct_etc_functions does not exist"
        print_message EMSG866 ${SCRIPT} ${RSCTBIN}/ct_etc_functions
    fi
    ;;
esac

# Serialize concurrent instances of this program: wait for our turn
# (default time out $DFLT_TIMEOUT) before performing the operation.
wait_for_my_turn $DFLT_TIMEOUT

if [[ $? -eq 0 ]]
then
    # We hold the ticket: run the operation selected on the command line
    # and exit with its status.
    EXIT_CODE=0
    case $Op in
    add )           # Add the subsystem to the SRC
        get_set_env
        add_subsys
        EXIT_CODE=$?
        ;;
    
    delete )        # Delete the subsystem from the SRC.
        del_subsys
        EXIT_CODE=$?
        ;;
    
    start )         # start the subsystem
        startsrc -e "$LOCALE_ENV" -s ${SUBSYSNAME}
        EXIT_CODE=$?
        if [[ "X${OSNAME}" = "XLinux" && $EXIT_CODE -eq 0 ]]
        then
            # Some Linux distributions, e.g. RedHat, need to keep tracking
            # which service is up to make it stopped automatically at
            # machine shutdown time.
            ${RSCTBIN}/ctinitmgr -s ${SUBSYSNAME}
        fi
        ;;
    
    start_with_DMS )# start the subsystem with the Dead Man Switch enabled
        startsrc -e "$LOCALE_ENV" -a "-e" -s ${SUBSYSNAME}
        EXIT_CODE=$?
        if [[ "X${OSNAME}" = "XLinux" && $EXIT_CODE -eq 0 ]]
        then
            # Some Linux distributions, e.g. RedHat, need to keep tracking
            # which service is up to make it stopped automatically at
            # machine shutdown time.
            ${RSCTBIN}/ctinitmgr -s ${SUBSYSNAME}
        fi
        ;;
    
    stop )          # stop the subsystem
        stop_subsys
        EXIT_CODE=$?
        ;;
    
    rebuild )       # rebuild machines.lst and put it into the repository
        rebuild_subsys
        EXIT_CODE=$?
        ;;
    
    refresh )       # refresh the configuration
        get_set_env
        refresh_subsys
        EXIT_CODE=$?
        ;;
    
    clean )         # clean up the subsystem to restore it to a known state
        clean_subsys
        EXIT_CODE=$?
        ;;
    
    unconfig )      # clean up the subsystem to restore it to a known state
                    # and remove the subsystem from the repository
        unconfig_subsys
        EXIT_CODE=$?
        ;;
    
    deinstall )     # clean up the subsystem for de-installing
                    # This function is used in unpost_i.sh to clean up all
                    # stuff created at run-time before de-install the
                    # Topology Services subsystem.
        get_set_env
        deinstall_subsys
        EXIT_CODE=$?
        ;;
    
    traceon )       # turn on all tracing in daemon
        traceson -s ${SUBSYSNAME}
        EXIT_CODE=$?
        ;;
    
    traceoff )      # turn off all tracing in daemon for the system partition
        tracesoff -s ${SUBSYSNAME}
        EXIT_CODE=$?
        ;;
    
    esac
    # Give the serialization ticket back now that the operation is done.
    clean_up_ticket 
else
    # "There are other $SCRIPT running. Multiple instances are not supported."
    print_message EMSG689 ${SCRIPT} ${SCRIPT}
    # This must be a plain assignment. Written with a leading "$" the
    # (unset) variable expands to nothing and the shell tries to run
    # "=1" as a command, leaving EXIT_CODE empty so the script exits
    # with the "command not found" status instead of 1.
    EXIT_CODE=1
fi
exit $EXIT_CODE
